Try using a Targetted Gene Correlation Network to identify the LFY1 and LFY2 networks
Install…
BiocManager::install(c("AnnotationDBi", "GO.db", "preprocessCore", "impute", "rrvgo", "ComplexHeatmap"))
BiocManager::install(c("sva", "GOSim"))
install.packages("MLmetrics")
remotes::install_local("~/Downloads/GOSim_1.40.0.tgz") # get download link from https://bioconductor.org/packages/3.18/bioc/html/GOSim.html
remotes::install_github('juanbot/CoExpNets')
# remotes::install_github("aliciagp/TGCN") # don't use this, use mine, see below!
use my modified version of TGCN
# remotes::install_local("~/git/TGCN/", force = TRUE)
# or from web:
remotes::install_github("https://github.com/jnmaloof/TGCN", ref = "dev")
Skipping install of 'TGCN' from a github remote, the SHA1 (6985dd91) has not changed since last install.
Use `force = TRUE` to force installation
library(TGCN)
library(tidyverse)
── Attaching core tidyverse packages ────────────────────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.1 ✔ tibble 3.2.1
✔ lubridate 1.9.4 ✔ tidyr 1.3.1
✔ purrr 1.0.4 ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ lubridate::%within%() masks IRanges::%within%()
✖ stringr::boundary() masks graph::boundary()
✖ dplyr::collapse() masks IRanges::collapse()
✖ dplyr::combine() masks Biobase::combine(), BiocGenerics::combine()
✖ dplyr::desc() masks IRanges::desc()
✖ tidyr::expand() masks S4Vectors::expand()
✖ dplyr::filter() masks stats::filter()
✖ dplyr::first() masks S4Vectors::first()
✖ dplyr::lag() masks stats::lag()
✖ ggplot2::Position() masks BiocGenerics::Position(), base::Position()
✖ purrr::reduce() masks IRanges::reduce()
✖ dplyr::rename() masks S4Vectors::rename()
✖ lubridate::second() masks S4Vectors::second()
✖ lubridate::second<-() masks S4Vectors::second<-()
✖ dplyr::select() masks AnnotationDbi::select()
✖ dplyr::slice() masks IRanges::slice()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(ggpubr)
library(gridExtra)
Attaching package: ‘gridExtra’
The following object is masked from ‘package:dplyr’:
combine
The following object is masked from ‘package:Biobase’:
combine
The following object is masked from ‘package:BiocGenerics’:
combine
sample.description <- read.csv("../input/sample.description.csv")
load read counts. These should be log2 cpm or Voom transformed or something similar.
lcpm <- read.csv("../output/log2cpm.csv.gz", row.names = 1, check.names = FALSE)
rownames(lcpm) <- str_remove(rownames(lcpm), fixed(".v2"))
head(lcpm)
dim(lcpm)
[1] 26704 48
Get the GO info and convert to a list.
Looking at the example notebook on the github site, it looks like genes in rows, samples in columns
My genes of interest.
CrLFY1 <- "Ceric.33G031700"
CrLFY2 <- "Ceric.18G076300"
Subset data to build LFY1 and LFY2 networks. Want to find networks around expression of LFY1 or LFY2, so pull those out and also create matrices that don’t have them.
LFY1.expression <- lcpm[str_detect(rownames(lcpm), CrLFY1),] %>%
unlist() %>%
as.vector()
input_for_LFY1 <- lcpm[str_detect(rownames(lcpm), CrLFY1, negate = TRUE),]
LFY2.expression <- lcpm[str_detect(rownames(lcpm), CrLFY2),] %>%
unlist() %>%
as.vector()
input_for_LFY2 <- lcpm[str_detect(rownames(lcpm), CrLFY2, negate = TRUE),]
if(dir.exists("../output/TGCN_LFY1")) system("rm -r ../output/TGCN_LFY1")
if(!dir.exists("../output/TGCN_LFY1")) dir.create("../output/TGCN_LFY1")
r <- testAllCutoffs(exprData=input_for_LFY1,
target=LFY1.expression,
covs=NULL,
train.split=0.7,
nfolds=5,
t=10,
path="../output/TGCN_LFY1",
targetName="LFY1",
tissueName="ALL",
seed=3333,
cutoffs=10:1,
n=100,
m=10,
s=10,
minCor=0.3,
maxTol=3,
save=T,
overwrite=T,
approach="enrichment", # the approach selected to complete the seed modules
report=F, # if report=T, an automated report will be created
gene2GO=gene2GO.list,
cellTypeAnnotation=FALSE)
- Step 1: Create a linear regression model for each gene set of hubs based on their ratio of appearance
Apply LASSO 10 times for feature selection
Warning: No enrichment can pe performed - there are no feasible GO terms!Warning: No enrichment can pe performed - there are no feasible GO terms!Warning: No enrichment can pe performed - there are no feasible GO terms!
Number of hubs per ratio of appearance
cutoff10 cutoff9 cutoff8 cutoff7 cutoff6 cutoff5 cutoff4 cutoff3 cutoff2 cutoff1
1 1 2 4 5 10 11 15 40 150
Cutoffs selected are 8 7 6 5 4 3 2 1
Get final model and the cv error for cutoff 8
Get final model and the cv error for cutoff 7
Get final model and the cv error for cutoff 6
Get final model and the cv error for cutoff 5
Get final model and the cv error for cutoff 4
Get final model and the cv error for cutoff 3
Get final model and the cv error for cutoff 2
Get final model and the cv error for cutoff 1
- Step 2: represent train and test error per ratio of appearance model
- Step 3: creating a network for each ratio of appearance where the number of hubs is <=30
Step 3.1: TGCN creation for ratio of appearance 8
Step 3.2: TGCN characterization for cutoff 8
Warning: No enrichment can pe performed - there are no feasible GO terms!
Step 3.1: TGCN creation for ratio of appearance 7
Step 3.2: TGCN characterization for cutoff 7
Step 3.1: TGCN creation for ratio of appearance 6
Step 3.2: TGCN characterization for cutoff 6
Step 3.1: TGCN creation for ratio of appearance 5
Step 3.2: TGCN characterization for cutoff 5
Step 3.1: TGCN creation for ratio of appearance 4
Step 3.2: TGCN characterization for cutoff 4
Step 3.1: TGCN creation for ratio of appearance 3
Step 3.2: TGCN characterization for cutoff 3
r.lfy1$selectRatio$nHubs
r.lfy1$selectRatio$stats
Focus on 5 and 6
p <- lapply(r.lfy1$nets, function(cutoff) cutoff$GOenrich$plotStats)
ggarrange(p$c5 + theme(text=element_text(size=10)),
p$c6 + theme(text=element_text(size=10)),
ncol=2, nrow=1, common.legend=T, legend="bottom")
r.lfy1$nets$c5$net$moduleSizeSelectionPlot
r.lfy1$nets$c6$net$moduleSizeSelectionPlot
Correlation with trait, I assume
r.lfy1$nets$c5$net$plotCorr
r.lfy1$nets$c6$net$plotCorr
DT::datatable(r.lfy1$nets$c5$net$modules)
DT::datatable(r.lfy1$nets$c6$net$modules)
knitr::include_graphics("../output/TGCN_LFY1/results/LFY1_ALL_c5_TGCN_crossTabPlot.png")
knitr::include_graphics("../output/TGCN_LFY1/results/LFY1_ALL_c6_TGCN_crossTabPlot.png")
grid.arrange(r.lfy1$nets$c5$GOenrich$plotStats, r.lfy1$nets$c5$GOenrich$plotNterms, nrow=2)
grid.arrange(r.lfy1$nets$c6$GOenrich$plotStats, r.lfy1$nets$c6$GOenrich$plotNterms, nrow=2)
if(dir.exists("../output/TGCN_LFY2")) system("rm -r ../output/TGCN_LFY2")
if(!dir.exists("../output/TGCN_LFY2")) dir.create("../output/TGCN_LFY2")
r.lfy2 <- testAllCutoffs(exprData=input_for_LFY2,
target=LFY2.expression,
covs=NULL,
train.split=0.7,
nfolds=5,
t=10,
path="../output/TGCN_LFY2",
targetName="LFY2",
tissueName="ALL",
seed=3333,
cutoffs=10:1,
n=100,
m=10,
s=10,
minCor=0.3,
maxTol=3,
save=T,
overwrite=T,
approach="enrichment", # the approach selected to complete the seed modules
report=F, # if report=T, an automated report will be created
gene2GO=gene2GO.list,
cellTypeAnnotation=FALSE)
- Step 1: Create a linear regression model for each gene set of hubs based on their ratio of appearance
Apply LASSO 10 times for feature selection
Number of hubs per ratio of appearance
cutoff10 cutoff9 cutoff8 cutoff7 cutoff6 cutoff5 cutoff4 cutoff3 cutoff2 cutoff1
0 0 1 1 2 2 5 15 39 152
Cutoffs selected are 6 5 4 3 2 1
Get final model and the cv error for cutoff 6
Get final model and the cv error for cutoff 5
Get final model and the cv error for cutoff 4
Get final model and the cv error for cutoff 3
Get final model and the cv error for cutoff 2
Get final model and the cv error for cutoff 1
- Step 2: represent train and test error per ratio of appearance model
- Step 3: creating a network for each ratio of appearance where the number of hubs is <=30
Step 3.1: TGCN creation for ratio of appearance 6
Step 3.2: TGCN characterization for cutoff 6
Step 3.1: TGCN creation for ratio of appearance 5
Step 3.2: TGCN characterization for cutoff 5
Step 3.1: TGCN creation for ratio of appearance 4
Step 3.2: TGCN characterization for cutoff 4
Step 3.1: TGCN creation for ratio of appearance 3
Step 3.2: TGCN characterization for cutoff 3
r.lfy2$selectRatio$nHubs
r.lfy2$selectRatio$stats
Focus on 3
p <- lapply(r.lfy2$nets, function(cutoff) cutoff$GOenrich$plotStats)
ggarrange(p$c3 + theme(text=element_text(size=10)),
p$c4 + theme(text=element_text(size=10)),
ncol=2, nrow=1, common.legend=T, legend="bottom")
Correlation with trait, I assume
grid.arrange(r.lfy2$nets$c3$GOenrich$plotStats, r.lfy2$nets$c3$GOenrich$plotNterms, nrow=2)
This was for troubleshooting…
exprData=input_for_LFY2
target=LFY2.expression
covs=NULL
train.split=0.7
nfolds=5
t=10
path="../output/TGCN_LFY2"
targetName="LFY2"
tissueName="ALL"
seed=1234
cutoffs=10:1
n=100
m=10
s=10
minCor=0.3
maxTol=3
save=T
overwrite=T
approach="enrichment" # the approach selected to complete the seed modules
report=F # if report=T an automated report will be created
gene2GO=gene2GO.list
reduced=F
cellTypeAnnotation=F
load the networks…